{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e1d35417-75f2-49fd-8695-36fb11a1dde3",
   "metadata": {},
   "source": [
    "# Creating the preprocessed UNSW-NB15 dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b7ec7097-f388-4f9b-af64-a91b718e8a31",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 301,
   "id": "f45361fb-b69f-4061-b4b0-945027fa054b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the raw UNSW-NB15 CSV files, and the path meant for the preprocessed CSV.\n",
    "input_dir = \"./UNSW/Raw\"\n",
    "output_file = f\"{input_dir}/UNSW-NB15_preprocessed.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 302,
   "id": "e6c141f0-8bc3-45f3-89c0-4ba25bd63604",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2540047, 49)\n"
     ]
    }
   ],
   "source": [
    "# Read the four UNSW-NB15 part files (header=None: the first row of each file is data)\n",
    "# and stack them into a single frame with a fresh 0..n-1 index.\n",
    "dfs = [\n",
    "    pd.read_csv(f\"{input_dir}/UNSW-NB15_{i}.csv\", header=None, low_memory=False)\n",
    "    for i in range(1, 5)\n",
    "]\n",
    "all_data = pd.concat(dfs, ignore_index=True)\n",
    "\n",
    "# Column names come from the \"Name\" column of the features file; they are trimmed,\n",
    "# stripped of inner spaces and lower-cased before being used as the header.\n",
    "df_col = pd.read_csv(input_dir + \"/NUSW-NB15_features.csv\", encoding=\"ISO-8859-1\")\n",
    "df_col[\"Name\"] = df_col[\"Name\"].str.strip().str.replace(\" \", \"\", regex=False).str.lower()\n",
    "# Assign a plain list rather than the Series: a Series would pass its own name (\"Name\")\n",
    "# on to the columns axis, where it shows up as a stray corner label in every rendered table.\n",
    "all_data.columns = df_col[\"Name\"].tolist()\n",
    "print(all_data.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 303,
   "id": "29045af0-1029-4039-89f9-c24dc94e04e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Missing attack categories become \"normal\"; every label is then trimmed and lower-cased.\n",
    "all_data[\"attack_cat\"] = (\n",
    "    all_data[\"attack_cat\"]\n",
    "    .fillna(\"normal\")\n",
    "    .str.strip()\n",
    "    .str.lower()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 304,
   "id": "01379342-f238-493a-afae-7baef566b8a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fold the plural spelling \"backdoors\" into \"backdoor\", then re-apply trim + lower-case.\n",
    "all_data[\"attack_cat\"] = (\n",
    "    all_data[\"attack_cat\"]\n",
    "    .replace(to_replace=\"backdoors\", value=\"backdoor\", regex=True)\n",
    "    .str.strip()\n",
    "    .str.lower()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 305,
   "id": "2df00682-b5b3-4f18-b4fb-13a1d79fc4ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Entries that are exactly \"-\" are replaced by the literal string \"None\"; all others are kept.\n",
    "all_data[\"service\"] = all_data[\"service\"].mask(all_data[\"service\"] == \"-\", \"None\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 306,
   "id": "076da2e4-b798-4fd5-8a31-42acc4924536",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Entries that are exactly one space are turned into 0 so the whole column can be cast to int.\n",
    "all_data[\"ct_ftp_cmd\"] = (\n",
    "    all_data[\"ct_ftp_cmd\"]\n",
    "    .mask(all_data[\"ct_ftp_cmd\"] == \" \", 0)\n",
    "    .astype(int)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 307,
   "id": "37431d8e-80c5-43c4-bb45-4c30b739ff03",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Swap the abbreviated address / port / protocol / label column names for descriptive ones.\n",
    "all_data.rename(\n",
    "    columns={\n",
    "        \"srcip\": \"source_ip\",\n",
    "        \"dstip\": \"destination_ip\",\n",
    "        \"sport\": \"source_port\",\n",
    "        \"dsport\": \"destination_port\",\n",
    "        \"proto\": \"protocol\",\n",
    "        \"attack_cat\": \"attack_label\",\n",
    "        \"label\": \"binary_label\",\n",
    "    },\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 308,
   "id": "fa8451f4-1f01-4caa-a79b-cffb805db970",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove rows that repeat an earlier row in every column, keeping the first occurrence.\n",
    "# The surviving rows keep their original index labels.\n",
    "all_data.drop_duplicates(keep=\"first\", inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 309,
   "id": "8420c0f7-1f75-4898-829c-d76e3a6c1afa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "      <th>state</th>\n",
       "      <th>dur</th>\n",
       "      <th>sbytes</th>\n",
       "      <th>dbytes</th>\n",
       "      <th>sttl</th>\n",
       "      <th>...</th>\n",
       "      <th>ct_ftp_cmd</th>\n",
       "      <th>ct_srv_src</th>\n",
       "      <th>ct_srv_dst</th>\n",
       "      <th>ct_dst_ltm</th>\n",
       "      <th>ct_src_ltm</th>\n",
       "      <th>ct_src_dport_ltm</th>\n",
       "      <th>ct_dst_sport_ltm</th>\n",
       "      <th>ct_dst_src_ltm</th>\n",
       "      <th>attack_label</th>\n",
       "      <th>binary_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>1390</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001055</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.036133</td>\n",
       "      <td>528</td>\n",
       "      <td>304</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>1464</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001119</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001209</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001169</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2540039</th>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2540041</th>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2540042</th>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>33094</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>43433</td>\n",
       "      <td>tcp</td>\n",
       "      <td>FIN</td>\n",
       "      <td>0.087306</td>\n",
       "      <td>320</td>\n",
       "      <td>1828</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2540045</th>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35433</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>2.200934</td>\n",
       "      <td>3498</td>\n",
       "      <td>166054</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2540046</th>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>17293</td>\n",
       "      <td>149.171.126.17</td>\n",
       "      <td>110</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.942984</td>\n",
       "      <td>574</td>\n",
       "      <td>676</td>\n",
       "      <td>62</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>exploits</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2059415 rows × 49 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "Name        source_ip source_port  destination_ip destination_port protocol  \\\n",
       "0          59.166.0.0        1390   149.171.126.6               53      udp   \n",
       "1          59.166.0.0       33661   149.171.126.9             1024      udp   \n",
       "2          59.166.0.6        1464   149.171.126.7               53      udp   \n",
       "3          59.166.0.5        3593   149.171.126.5               53      udp   \n",
       "4          59.166.0.3       49664   149.171.126.0               53      udp   \n",
       "...               ...         ...             ...              ...      ...   \n",
       "2540039    59.166.0.1       38606   149.171.126.9               80      tcp   \n",
       "2540041    59.166.0.1       38606   149.171.126.9               80      tcp   \n",
       "2540042    59.166.0.5       33094   149.171.126.7            43433      tcp   \n",
       "2540045    59.166.0.9       35433   149.171.126.0               80      tcp   \n",
       "2540046  175.45.176.0       17293  149.171.126.17              110      tcp   \n",
       "\n",
       "Name    state       dur  sbytes  dbytes  sttl  ...  ct_ftp_cmd  ct_srv_src  \\\n",
       "0         CON  0.001055     132     164    31  ...           0           3   \n",
       "1         CON  0.036133     528     304    31  ...           0           2   \n",
       "2         CON  0.001119     146     178    31  ...           0          12   \n",
       "3         CON  0.001209     132     164    31  ...           0           6   \n",
       "4         CON  0.001169     146     178    31  ...           0           7   \n",
       "...       ...       ...     ...     ...   ...  ...         ...         ...   \n",
       "2540039   CON  0.564998   14106  772406    31  ...           0           1   \n",
       "2540041   CON  0.564998   14106  772406    31  ...           0           2   \n",
       "2540042   FIN  0.087306     320    1828    31  ...           0           1   \n",
       "2540045   CON  2.200934    3498  166054    31  ...           0           1   \n",
       "2540046   CON  0.942984     574     676    62  ...           0           1   \n",
       "\n",
       "Name     ct_srv_dst ct_dst_ltm  ct_src_ltm  ct_src_dport_ltm  \\\n",
       "0                 7          1           3                 1   \n",
       "1                 4          2           3                 1   \n",
       "2                 8          1           2                 2   \n",
       "3                 9          1           1                 1   \n",
       "4                 9          1           1                 1   \n",
       "...             ...        ...         ...               ...   \n",
       "2540039           1          4           2                 2   \n",
       "2540041           1          4           2                 2   \n",
       "2540042           2          3           3                 1   \n",
       "2540045           1          2           4                 2   \n",
       "2540046           1          2           4                 2   \n",
       "\n",
       "Name     ct_dst_sport_ltm  ct_dst_src_ltm  attack_label  binary_label  \n",
       "0                       1               1        normal             0  \n",
       "1                       1               2        normal             0  \n",
       "2                       1               1        normal             0  \n",
       "3                       1               1        normal             0  \n",
       "4                       1               1        normal             0  \n",
       "...                   ...             ...           ...           ...  \n",
       "2540039                 2               2        normal             0  \n",
       "2540041                 2               2        normal             0  \n",
       "2540042                 1               3        normal             0  \n",
       "2540045                 2               2        normal             0  \n",
       "2540046                 2               2      exploits             1  \n",
       "\n",
       "[2059415 rows x 49 columns]"
      ]
     },
     "execution_count": 309,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the frame after the column renames and duplicate removal.\n",
    "all_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 310,
   "id": "635a9dbd-bc31-4d27-8174-dee3ce770492",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number the remaining rows consecutively, starting at start_value, in a new flow_id column.\n",
    "start_value = 1\n",
    "end_value = len(all_data) + start_value  # exclusive upper bound for range()\n",
    "all_data[\"flow_id\"] = range(start_value, end_value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 311,
   "id": "d6bd7eea-b4d6-4d9e-b2a7-ea6c6f9edc7f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "      <th>state</th>\n",
       "      <th>dur</th>\n",
       "      <th>sbytes</th>\n",
       "      <th>dbytes</th>\n",
       "      <th>sttl</th>\n",
       "      <th>...</th>\n",
       "      <th>ct_srv_src</th>\n",
       "      <th>ct_srv_dst</th>\n",
       "      <th>ct_dst_ltm</th>\n",
       "      <th>ct_src_ltm</th>\n",
       "      <th>ct_src_dport_ltm</th>\n",
       "      <th>ct_dst_sport_ltm</th>\n",
       "      <th>ct_dst_src_ltm</th>\n",
       "      <th>attack_label</th>\n",
       "      <th>binary_label</th>\n",
       "      <th>flow_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>1390</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001055</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.036133</td>\n",
       "      <td>528</td>\n",
       "      <td>304</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>1464</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001119</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>12</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001209</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001169</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>7</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2540039</th>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "      <td>2059411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2540041</th>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "      <td>2059412</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2540042</th>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>33094</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>43433</td>\n",
       "      <td>tcp</td>\n",
       "      <td>FIN</td>\n",
       "      <td>0.087306</td>\n",
       "      <td>320</td>\n",
       "      <td>1828</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "      <td>2059413</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2540045</th>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35433</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>2.200934</td>\n",
       "      <td>3498</td>\n",
       "      <td>166054</td>\n",
       "      <td>31</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "      <td>2059414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2540046</th>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>17293</td>\n",
       "      <td>149.171.126.17</td>\n",
       "      <td>110</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.942984</td>\n",
       "      <td>574</td>\n",
       "      <td>676</td>\n",
       "      <td>62</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>exploits</td>\n",
       "      <td>1</td>\n",
       "      <td>2059415</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2059415 rows × 50 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "Name        source_ip source_port  destination_ip destination_port protocol  \\\n",
       "0          59.166.0.0        1390   149.171.126.6               53      udp   \n",
       "1          59.166.0.0       33661   149.171.126.9             1024      udp   \n",
       "2          59.166.0.6        1464   149.171.126.7               53      udp   \n",
       "3          59.166.0.5        3593   149.171.126.5               53      udp   \n",
       "4          59.166.0.3       49664   149.171.126.0               53      udp   \n",
       "...               ...         ...             ...              ...      ...   \n",
       "2540039    59.166.0.1       38606   149.171.126.9               80      tcp   \n",
       "2540041    59.166.0.1       38606   149.171.126.9               80      tcp   \n",
       "2540042    59.166.0.5       33094   149.171.126.7            43433      tcp   \n",
       "2540045    59.166.0.9       35433   149.171.126.0               80      tcp   \n",
       "2540046  175.45.176.0       17293  149.171.126.17              110      tcp   \n",
       "\n",
       "Name    state       dur  sbytes  dbytes  sttl  ...  ct_srv_src  ct_srv_dst  \\\n",
       "0         CON  0.001055     132     164    31  ...           3           7   \n",
       "1         CON  0.036133     528     304    31  ...           2           4   \n",
       "2         CON  0.001119     146     178    31  ...          12           8   \n",
       "3         CON  0.001209     132     164    31  ...           6           9   \n",
       "4         CON  0.001169     146     178    31  ...           7           9   \n",
       "...       ...       ...     ...     ...   ...  ...         ...         ...   \n",
       "2540039   CON  0.564998   14106  772406    31  ...           1           1   \n",
       "2540041   CON  0.564998   14106  772406    31  ...           2           1   \n",
       "2540042   FIN  0.087306     320    1828    31  ...           1           2   \n",
       "2540045   CON  2.200934    3498  166054    31  ...           1           1   \n",
       "2540046   CON  0.942984     574     676    62  ...           1           1   \n",
       "\n",
       "Name     ct_dst_ltm ct_src_ltm  ct_src_dport_ltm  ct_dst_sport_ltm  \\\n",
       "0                 1          3                 1                 1   \n",
       "1                 2          3                 1                 1   \n",
       "2                 1          2                 2                 1   \n",
       "3                 1          1                 1                 1   \n",
       "4                 1          1                 1                 1   \n",
       "...             ...        ...               ...               ...   \n",
       "2540039           4          2                 2                 2   \n",
       "2540041           4          2                 2                 2   \n",
       "2540042           3          3                 1                 1   \n",
       "2540045           2          4                 2                 2   \n",
       "2540046           2          4                 2                 2   \n",
       "\n",
       "Name     ct_dst_src_ltm  attack_label  binary_label  flow_id  \n",
       "0                     1        normal             0        1  \n",
       "1                     2        normal             0        2  \n",
       "2                     1        normal             0        3  \n",
       "3                     1        normal             0        4  \n",
       "4                     1        normal             0        5  \n",
       "...                 ...           ...           ...      ...  \n",
       "2540039               2        normal             0  2059411  \n",
       "2540041               2        normal             0  2059412  \n",
       "2540042               3        normal             0  2059413  \n",
       "2540045               2        normal             0  2059414  \n",
       "2540046               2      exploits             1  2059415  \n",
       "\n",
       "[2059415 rows x 50 columns]"
      ]
     },
     "execution_count": 311,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the frame again now that the flow_id column has been appended.\n",
    "all_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 312,
   "id": "95b4dd91-6640-4452-8f5d-ee4652043ada",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Move the flow_id column from the last position to the first one.\n",
    "flow_id = all_data.pop('flow_id')\n",
    "all_data.insert(loc=0, column='flow_id', value=flow_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 313,
   "id": "32974ed3-8aeb-45db-8cd8-a3b6aed77f9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The index labels have gaps (e.g. 2540039 is followed by 2540041), so\n",
    "# renumber the rows 0..n-1 and discard the old labels.\n",
    "all_data.reset_index(inplace=True, drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 314,
   "id": "9eb3aa54-2a51-4fab-bf27-659c30207c5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The stray 'Name' label in the frame header is the name of the *columns* axis\n",
    "# (inherited from df_col['Name'] when the column names were assigned), not of\n",
    "# the row index. rename_axis defaults to axis=0, which left the label in place,\n",
    "# so the columns axis has to be targeted explicitly.\n",
    "all_data.rename_axis(None, axis='columns', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 315,
   "id": "a8145f37-508d-42ef-a9da-8246952c7a12",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>flow_id</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "      <th>state</th>\n",
       "      <th>dur</th>\n",
       "      <th>sbytes</th>\n",
       "      <th>dbytes</th>\n",
       "      <th>...</th>\n",
       "      <th>ct_ftp_cmd</th>\n",
       "      <th>ct_srv_src</th>\n",
       "      <th>ct_srv_dst</th>\n",
       "      <th>ct_dst_ltm</th>\n",
       "      <th>ct_src_ltm</th>\n",
       "      <th>ct_src_dport_ltm</th>\n",
       "      <th>ct_dst_sport_ltm</th>\n",
       "      <th>ct_dst_src_ltm</th>\n",
       "      <th>attack_label</th>\n",
       "      <th>binary_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>1390</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001055</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.036133</td>\n",
       "      <td>528</td>\n",
       "      <td>304</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>1464</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001119</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001209</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001169</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059410</th>\n",
       "      <td>2059411</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059411</th>\n",
       "      <td>2059412</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059412</th>\n",
       "      <td>2059413</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>33094</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>43433</td>\n",
       "      <td>tcp</td>\n",
       "      <td>FIN</td>\n",
       "      <td>0.087306</td>\n",
       "      <td>320</td>\n",
       "      <td>1828</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059413</th>\n",
       "      <td>2059414</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35433</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>2.200934</td>\n",
       "      <td>3498</td>\n",
       "      <td>166054</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059414</th>\n",
       "      <td>2059415</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>17293</td>\n",
       "      <td>149.171.126.17</td>\n",
       "      <td>110</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.942984</td>\n",
       "      <td>574</td>\n",
       "      <td>676</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>exploits</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2059415 rows × 50 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "Name     flow_id     source_ip source_port  destination_ip destination_port  \\\n",
       "0              1    59.166.0.0        1390   149.171.126.6               53   \n",
       "1              2    59.166.0.0       33661   149.171.126.9             1024   \n",
       "2              3    59.166.0.6        1464   149.171.126.7               53   \n",
       "3              4    59.166.0.5        3593   149.171.126.5               53   \n",
       "4              5    59.166.0.3       49664   149.171.126.0               53   \n",
       "...          ...           ...         ...             ...              ...   \n",
       "2059410  2059411    59.166.0.1       38606   149.171.126.9               80   \n",
       "2059411  2059412    59.166.0.1       38606   149.171.126.9               80   \n",
       "2059412  2059413    59.166.0.5       33094   149.171.126.7            43433   \n",
       "2059413  2059414    59.166.0.9       35433   149.171.126.0               80   \n",
       "2059414  2059415  175.45.176.0       17293  149.171.126.17              110   \n",
       "\n",
       "Name    protocol state       dur  sbytes  dbytes  ...  ct_ftp_cmd  ct_srv_src  \\\n",
       "0            udp   CON  0.001055     132     164  ...           0           3   \n",
       "1            udp   CON  0.036133     528     304  ...           0           2   \n",
       "2            udp   CON  0.001119     146     178  ...           0          12   \n",
       "3            udp   CON  0.001209     132     164  ...           0           6   \n",
       "4            udp   CON  0.001169     146     178  ...           0           7   \n",
       "...          ...   ...       ...     ...     ...  ...         ...         ...   \n",
       "2059410      tcp   CON  0.564998   14106  772406  ...           0           1   \n",
       "2059411      tcp   CON  0.564998   14106  772406  ...           0           2   \n",
       "2059412      tcp   FIN  0.087306     320    1828  ...           0           1   \n",
       "2059413      tcp   CON  2.200934    3498  166054  ...           0           1   \n",
       "2059414      tcp   CON  0.942984     574     676  ...           0           1   \n",
       "\n",
       "Name     ct_srv_dst  ct_dst_ltm ct_src_ltm  ct_src_dport_ltm  \\\n",
       "0                 7           1          3                 1   \n",
       "1                 4           2          3                 1   \n",
       "2                 8           1          2                 2   \n",
       "3                 9           1          1                 1   \n",
       "4                 9           1          1                 1   \n",
       "...             ...         ...        ...               ...   \n",
       "2059410           1           4          2                 2   \n",
       "2059411           1           4          2                 2   \n",
       "2059412           2           3          3                 1   \n",
       "2059413           1           2          4                 2   \n",
       "2059414           1           2          4                 2   \n",
       "\n",
       "Name     ct_dst_sport_ltm  ct_dst_src_ltm  attack_label  binary_label  \n",
       "0                       1               1        normal             0  \n",
       "1                       1               2        normal             0  \n",
       "2                       1               1        normal             0  \n",
       "3                       1               1        normal             0  \n",
       "4                       1               1        normal             0  \n",
       "...                   ...             ...           ...           ...  \n",
       "2059410                 2               2        normal             0  \n",
       "2059411                 2               2        normal             0  \n",
       "2059412                 1               3        normal             0  \n",
       "2059413                 2               2        normal             0  \n",
       "2059414                 2               2      exploits             1  \n",
       "\n",
       "[2059415 rows x 50 columns]"
      ]
     },
     "execution_count": 315,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 316,
   "id": "6c589dbb-e57a-46f3-a0ff-bd9d1e13e04e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the flow table; the positional index is not written because\n",
    "# flow_id is already stored as the first column.\n",
    "all_data.to_csv(path_or_buf='./UNSW/Export/UNSW_Flow.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "377cb6a2-b408-4d5f-87af-867d86c801c2",
   "metadata": {},
   "source": [
    "# Processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "718cbaca-41de-433a-8fc3-b5fbee528a9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the packet-level table. low_memory=False makes pandas infer each\n",
    "# column's dtype from the whole file instead of chunk by chunk, which avoids\n",
    "# the mixed-type DtypeWarning this read used to raise for many columns and\n",
    "# matches how the raw UNSW CSV files are read at the top of the notebook.\n",
    "df = pd.read_csv('./UNSW/output11.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ef8045a7-12fd-4c27-a82f-87fa3199ef3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename the packet-table columns: the address/port/protocol columns take the\n",
    "# same names as in the flow table, and the raw hex columns get a _hex suffix.\n",
    "df.rename(\n",
    "    columns={\n",
    "        'packet': 'packet_hex',\n",
    "        'payload': 'payload_hex',\n",
    "        'srcip': 'source_ip',\n",
    "        'dstip': 'destination_ip',\n",
    "        'sport': 'source_port',\n",
    "        'dsport': 'destination_port',\n",
    "        'protocol_m': 'protocol',\n",
    "    },\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e7adef88-445e-40dc-8631-d71f191c4d72",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet_hex</th>\n",
       "      <th>...</th>\n",
       "      <th>DNS labels</th>\n",
       "      <th>DNS inception</th>\n",
       "      <th>DNS keytag</th>\n",
       "      <th>DNS signature</th>\n",
       "      <th>DNS nextname</th>\n",
       "      <th>TFTP Ack block</th>\n",
       "      <th>TFTP_Options oname</th>\n",
       "      <th>TFTP_Options value</th>\n",
       "      <th>TFTP Data block</th>\n",
       "      <th>LDAP present</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>180.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450000b4256940...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000034256840...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000034256840...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000049256b40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000049256b40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999995</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339.0</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003428f240...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999996</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339.0</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003428fe40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999997</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339.0</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500003428fe40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999998</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>43805.0</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>143.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000034865d40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999999</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>18965.0</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>80.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31.0</td>\n",
       "      <td>285.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500011d5f9840...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000000 rows × 397 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                stime      source_ip  source_port destination_ip  \\\n",
       "0        1.424225e+09  149.171.126.1        143.0     59.166.0.9   \n",
       "1        1.424225e+09  149.171.126.1        143.0     59.166.0.9   \n",
       "2        1.424225e+09  149.171.126.1        143.0     59.166.0.9   \n",
       "3        1.424225e+09  149.171.126.1        143.0     59.166.0.9   \n",
       "4        1.424225e+09  149.171.126.1        143.0     59.166.0.9   \n",
       "...               ...            ...          ...            ...   \n",
       "9999995  1.424230e+09     59.166.0.2      32339.0  149.171.126.5   \n",
       "9999996  1.424230e+09     59.166.0.2      32339.0  149.171.126.5   \n",
       "9999997  1.424230e+09     59.166.0.2      32339.0  149.171.126.5   \n",
       "9999998  1.424230e+09     59.166.0.3      43805.0  149.171.126.9   \n",
       "9999999  1.424230e+09     59.166.0.6      18965.0  149.171.126.6   \n",
       "\n",
       "         destination_port protocol  sttl  total_len   first_layer  \\\n",
       "0                 35632.0      tcp  29.0      180.0  cooked linux   \n",
       "1                 35632.0      tcp  30.0       52.0  cooked linux   \n",
       "2                 35632.0      tcp  29.0       52.0  cooked linux   \n",
       "3                 35632.0      tcp  30.0       73.0  cooked linux   \n",
       "4                 35632.0      tcp  29.0       73.0  cooked linux   \n",
       "...                   ...      ...   ...        ...           ...   \n",
       "9999995            6881.0      tcp  32.0       52.0  cooked linux   \n",
       "9999996            6881.0      tcp  32.0       52.0  cooked linux   \n",
       "9999997            6881.0      tcp  31.0       52.0  cooked linux   \n",
       "9999998             143.0      tcp  32.0       52.0  cooked linux   \n",
       "9999999              80.0      tcp  31.0      285.0  cooked linux   \n",
       "\n",
       "                                                packet_hex  ... DNS labels  \\\n",
       "0        000400010006005056a5776300000800450000b4256940...  ...        NaN   \n",
       "1        000000010006001b17059e1c0000080045000034256840...  ...        NaN   \n",
       "2        000400010006005056a577630000080045000034256840...  ...        NaN   \n",
       "3        000000010006001b17059e1c0000080045000049256b40...  ...        NaN   \n",
       "4        000400010006005056a577630000080045000049256b40...  ...        NaN   \n",
       "...                                                    ...  ...        ...   \n",
       "9999995  000000010006021ac5000000000008004500003428f240...  ...        NaN   \n",
       "9999996  000000010006021ac5000000000008004500003428fe40...  ...        NaN   \n",
       "9999997  000400010006005056a524c2000008004500003428fe40...  ...        NaN   \n",
       "9999998  000000010006021ac50000000000080045000034865d40...  ...        NaN   \n",
       "9999999  000400010006005056a524c2000008004500011d5f9840...  ...        NaN   \n",
       "\n",
       "         DNS inception  DNS keytag DNS signature  DNS nextname  \\\n",
       "0                  NaN         NaN           NaN           NaN   \n",
       "1                  NaN         NaN           NaN           NaN   \n",
       "2                  NaN         NaN           NaN           NaN   \n",
       "3                  NaN         NaN           NaN           NaN   \n",
       "4                  NaN         NaN           NaN           NaN   \n",
       "...                ...         ...           ...           ...   \n",
       "9999995            NaN         NaN           NaN           NaN   \n",
       "9999996            NaN         NaN           NaN           NaN   \n",
       "9999997            NaN         NaN           NaN           NaN   \n",
       "9999998            NaN         NaN           NaN           NaN   \n",
       "9999999            NaN         NaN           NaN           NaN   \n",
       "\n",
       "         TFTP Ack block TFTP_Options oname  TFTP_Options value  \\\n",
       "0                   NaN                NaN                 NaN   \n",
       "1                   NaN                NaN                 NaN   \n",
       "2                   NaN                NaN                 NaN   \n",
       "3                   NaN                NaN                 NaN   \n",
       "4                   NaN                NaN                 NaN   \n",
       "...                 ...                ...                 ...   \n",
       "9999995             NaN                NaN                 NaN   \n",
       "9999996             NaN                NaN                 NaN   \n",
       "9999997             NaN                NaN                 NaN   \n",
       "9999998             NaN                NaN                 NaN   \n",
       "9999999             NaN                NaN                 NaN   \n",
       "\n",
       "        TFTP Data block LDAP present  \n",
       "0                   NaN          NaN  \n",
       "1                   NaN          NaN  \n",
       "2                   NaN          NaN  \n",
       "3                   NaN          NaN  \n",
       "4                   NaN          NaN  \n",
       "...                 ...          ...  \n",
       "9999995             NaN          NaN  \n",
       "9999996             NaN          NaN  \n",
       "9999997             NaN          NaN  \n",
       "9999998             NaN          NaN  \n",
       "9999999             NaN          NaN  \n",
       "\n",
       "[10000000 rows x 397 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "311989a3-8be5-4400-988e-6cd68e213fd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the exported flow table; the relative path is resolved against the\n",
    "# kernel's current working directory.\n",
    "flow1 = pd.read_csv(\n",
    "    filepath_or_buffer='./UNSW/Export/UNSW_Flow.csv',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0c6ba6a7-afc1-4854-9aff-0bb838b355c6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>flow_id</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "      <th>state</th>\n",
       "      <th>dur</th>\n",
       "      <th>sbytes</th>\n",
       "      <th>dbytes</th>\n",
       "      <th>...</th>\n",
       "      <th>ct_ftp_cmd</th>\n",
       "      <th>ct_srv_src</th>\n",
       "      <th>ct_srv_dst</th>\n",
       "      <th>ct_dst_ltm</th>\n",
       "      <th>ct_src_ltm</th>\n",
       "      <th>ct_src_dport_ltm</th>\n",
       "      <th>ct_dst_sport_ltm</th>\n",
       "      <th>ct_dst_src_ltm</th>\n",
       "      <th>attack_label</th>\n",
       "      <th>binary_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>1390</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001055</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.036133</td>\n",
       "      <td>528</td>\n",
       "      <td>304</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>1464</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001119</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001209</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001169</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059410</th>\n",
       "      <td>2059411</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059411</th>\n",
       "      <td>2059412</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059412</th>\n",
       "      <td>2059413</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>33094</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>43433</td>\n",
       "      <td>tcp</td>\n",
       "      <td>FIN</td>\n",
       "      <td>0.087306</td>\n",
       "      <td>320</td>\n",
       "      <td>1828</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059413</th>\n",
       "      <td>2059414</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35433</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>2.200934</td>\n",
       "      <td>3498</td>\n",
       "      <td>166054</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059414</th>\n",
       "      <td>2059415</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>17293</td>\n",
       "      <td>149.171.126.17</td>\n",
       "      <td>110</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.942984</td>\n",
       "      <td>574</td>\n",
       "      <td>676</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>exploits</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2059415 rows × 50 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         flow_id     source_ip  source_port  destination_ip  destination_port  \\\n",
       "0              1    59.166.0.0         1390   149.171.126.6                53   \n",
       "1              2    59.166.0.0        33661   149.171.126.9              1024   \n",
       "2              3    59.166.0.6         1464   149.171.126.7                53   \n",
       "3              4    59.166.0.5         3593   149.171.126.5                53   \n",
       "4              5    59.166.0.3        49664   149.171.126.0                53   \n",
       "...          ...           ...          ...             ...               ...   \n",
       "2059410  2059411    59.166.0.1        38606   149.171.126.9                80   \n",
       "2059411  2059412    59.166.0.1        38606   149.171.126.9                80   \n",
       "2059412  2059413    59.166.0.5        33094   149.171.126.7             43433   \n",
       "2059413  2059414    59.166.0.9        35433   149.171.126.0                80   \n",
       "2059414  2059415  175.45.176.0        17293  149.171.126.17               110   \n",
       "\n",
       "        protocol state       dur  sbytes  dbytes  ...  ct_ftp_cmd  ct_srv_src  \\\n",
       "0            udp   CON  0.001055     132     164  ...           0           3   \n",
       "1            udp   CON  0.036133     528     304  ...           0           2   \n",
       "2            udp   CON  0.001119     146     178  ...           0          12   \n",
       "3            udp   CON  0.001209     132     164  ...           0           6   \n",
       "4            udp   CON  0.001169     146     178  ...           0           7   \n",
       "...          ...   ...       ...     ...     ...  ...         ...         ...   \n",
       "2059410      tcp   CON  0.564998   14106  772406  ...           0           1   \n",
       "2059411      tcp   CON  0.564998   14106  772406  ...           0           2   \n",
       "2059412      tcp   FIN  0.087306     320    1828  ...           0           1   \n",
       "2059413      tcp   CON  2.200934    3498  166054  ...           0           1   \n",
       "2059414      tcp   CON  0.942984     574     676  ...           0           1   \n",
       "\n",
       "         ct_srv_dst  ct_dst_ltm ct_src_ltm  ct_src_dport_ltm  \\\n",
       "0                 7           1          3                 1   \n",
       "1                 4           2          3                 1   \n",
       "2                 8           1          2                 2   \n",
       "3                 9           1          1                 1   \n",
       "4                 9           1          1                 1   \n",
       "...             ...         ...        ...               ...   \n",
       "2059410           1           4          2                 2   \n",
       "2059411           1           4          2                 2   \n",
       "2059412           2           3          3                 1   \n",
       "2059413           1           2          4                 2   \n",
       "2059414           1           2          4                 2   \n",
       "\n",
       "         ct_dst_sport_ltm  ct_dst_src_ltm  attack_label  binary_label  \n",
       "0                       1               1        normal             0  \n",
       "1                       1               2        normal             0  \n",
       "2                       1               1        normal             0  \n",
       "3                       1               1        normal             0  \n",
       "4                       1               1        normal             0  \n",
       "...                   ...             ...           ...           ...  \n",
       "2059410                 2               2        normal             0  \n",
       "2059411                 2               2        normal             0  \n",
       "2059412                 1               3        normal             0  \n",
       "2059413                 2               2        normal             0  \n",
       "2059414                 2               2      exploits             1  \n",
       "\n",
       "[2059415 rows x 50 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "flow1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e722d71b-3f14-4b72-a6cf-0d6a752406e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Narrow flow1 to the flow id plus the address/port/protocol columns.\n",
    "flow1 = flow1[[\n",
    "    'flow_id',\n",
    "    'source_ip', 'source_port',\n",
    "    'destination_ip', 'destination_port',\n",
    "    'protocol',\n",
    "]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bf3d9cba-0ed8-4174-b499-c7b3325483f6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>flow_id</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>1390</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>1464</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059410</th>\n",
       "      <td>2059411</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059411</th>\n",
       "      <td>2059412</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059412</th>\n",
       "      <td>2059413</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>33094</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>43433</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059413</th>\n",
       "      <td>2059414</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35433</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059414</th>\n",
       "      <td>2059415</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>17293</td>\n",
       "      <td>149.171.126.17</td>\n",
       "      <td>110</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2059415 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         flow_id     source_ip  source_port  destination_ip  destination_port  \\\n",
       "0              1    59.166.0.0         1390   149.171.126.6                53   \n",
       "1              2    59.166.0.0        33661   149.171.126.9              1024   \n",
       "2              3    59.166.0.6         1464   149.171.126.7                53   \n",
       "3              4    59.166.0.5         3593   149.171.126.5                53   \n",
       "4              5    59.166.0.3        49664   149.171.126.0                53   \n",
       "...          ...           ...          ...             ...               ...   \n",
       "2059410  2059411    59.166.0.1        38606   149.171.126.9                80   \n",
       "2059411  2059412    59.166.0.1        38606   149.171.126.9                80   \n",
       "2059412  2059413    59.166.0.5        33094   149.171.126.7             43433   \n",
       "2059413  2059414    59.166.0.9        35433   149.171.126.0                80   \n",
       "2059414  2059415  175.45.176.0        17293  149.171.126.17               110   \n",
       "\n",
       "        protocol  \n",
       "0            udp  \n",
       "1            udp  \n",
       "2            udp  \n",
       "3            udp  \n",
       "4            udp  \n",
       "...          ...  \n",
       "2059410      tcp  \n",
       "2059411      tcp  \n",
       "2059412      tcp  \n",
       "2059413      tcp  \n",
       "2059414      tcp  \n",
       "\n",
       "[2059415 rows x 6 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "flow1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3ff79d22-39a9-4c6f-ac45-e63e3cfee46c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Independent (deep) copy of flow1, so later changes to flow2 leave flow1 untouched.\n",
    "flow2 = flow1.copy(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8ccc8db5-2d78-4a89-94dd-3677a1f37a56",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the reverse-direction view of every flow by swapping the source and\n",
    "# destination endpoint columns. rename() returns a new frame derived from flow1\n",
    "# rather than mutating flow2 in place, so re-running this cell is idempotent\n",
    "# (an in-place swap executed twice would restore the original direction).\n",
    "flow2 = flow1.rename(columns={\n",
    "    'source_ip': 'destination_ip',\n",
    "    'destination_ip': 'source_ip',\n",
    "    'source_port': 'destination_port',\n",
    "    'destination_port': 'source_port',\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f7aefd04-d9c2-4f55-991d-f062bcb10d71",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>flow_id</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>protocol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>1390</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>1464</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059410</th>\n",
       "      <td>2059411</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059411</th>\n",
       "      <td>2059412</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059412</th>\n",
       "      <td>2059413</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>33094</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>43433</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059413</th>\n",
       "      <td>2059414</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35433</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059414</th>\n",
       "      <td>2059415</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>17293</td>\n",
       "      <td>149.171.126.17</td>\n",
       "      <td>110</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2059415 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         flow_id destination_ip  destination_port       source_ip  \\\n",
       "0              1     59.166.0.0              1390   149.171.126.6   \n",
       "1              2     59.166.0.0             33661   149.171.126.9   \n",
       "2              3     59.166.0.6              1464   149.171.126.7   \n",
       "3              4     59.166.0.5              3593   149.171.126.5   \n",
       "4              5     59.166.0.3             49664   149.171.126.0   \n",
       "...          ...            ...               ...             ...   \n",
       "2059410  2059411     59.166.0.1             38606   149.171.126.9   \n",
       "2059411  2059412     59.166.0.1             38606   149.171.126.9   \n",
       "2059412  2059413     59.166.0.5             33094   149.171.126.7   \n",
       "2059413  2059414     59.166.0.9             35433   149.171.126.0   \n",
       "2059414  2059415   175.45.176.0             17293  149.171.126.17   \n",
       "\n",
       "         source_port protocol  \n",
       "0                 53      udp  \n",
       "1               1024      udp  \n",
       "2                 53      udp  \n",
       "3                 53      udp  \n",
       "4                 53      udp  \n",
       "...              ...      ...  \n",
       "2059410           80      tcp  \n",
       "2059411           80      tcp  \n",
       "2059412        43433      tcp  \n",
       "2059413           80      tcp  \n",
       "2059414          110      tcp  \n",
       "\n",
       "[2059415 rows x 6 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "flow2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e17913e8-2c7f-4592-9c30-ef4047188b47",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the forward (flow1) and reversed (flow2) rows into one table. concat\n",
    "# aligns columns by name, so flow2's different column order is harmless.\n",
    "# ignore_index=True gives the result a fresh 0..n-1 index; without it each\n",
    "# index label would occur twice (once per direction).\n",
    "flow = pd.concat([flow1, flow2], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "219b9bbd-6238-4e19-8286-6bc22aad1858",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>flow_id</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>1390</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>1464</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059410</th>\n",
       "      <td>2059411</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059411</th>\n",
       "      <td>2059412</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059412</th>\n",
       "      <td>2059413</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>43433</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>33094</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059413</th>\n",
       "      <td>2059414</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35433</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059414</th>\n",
       "      <td>2059415</td>\n",
       "      <td>149.171.126.17</td>\n",
       "      <td>110</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>17293</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4118830 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         flow_id       source_ip  source_port destination_ip  \\\n",
       "0              1      59.166.0.0         1390  149.171.126.6   \n",
       "1              2      59.166.0.0        33661  149.171.126.9   \n",
       "2              3      59.166.0.6         1464  149.171.126.7   \n",
       "3              4      59.166.0.5         3593  149.171.126.5   \n",
       "4              5      59.166.0.3        49664  149.171.126.0   \n",
       "...          ...             ...          ...            ...   \n",
       "2059410  2059411   149.171.126.9           80     59.166.0.1   \n",
       "2059411  2059412   149.171.126.9           80     59.166.0.1   \n",
       "2059412  2059413   149.171.126.7        43433     59.166.0.5   \n",
       "2059413  2059414   149.171.126.0           80     59.166.0.9   \n",
       "2059414  2059415  149.171.126.17          110   175.45.176.0   \n",
       "\n",
       "         destination_port protocol  \n",
       "0                      53      udp  \n",
       "1                    1024      udp  \n",
       "2                      53      udp  \n",
       "3                      53      udp  \n",
       "4                      53      udp  \n",
       "...                   ...      ...  \n",
       "2059410             38606      tcp  \n",
       "2059411             38606      tcp  \n",
       "2059412             33094      tcp  \n",
       "2059413             35433      tcp  \n",
       "2059414             17293      tcp  \n",
       "\n",
       "[4118830 rows x 6 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show `flow` as it stands before the duplicate drop performed in the next cell.\n",
    "flow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "a67aeb21-9a02-493c-8af0-e8d25f3ae426",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a single row per combination of every column except `flow_id`.\n",
    "# The default keep='first' retains the first row seen for each combination.\n",
    "# NOTE(review): distinct flows that share all other columns collapse onto that\n",
    "# first flow_id - confirm this is the intended mapping.\n",
    "# The result is rebound instead of using inplace=True, so re-running the cell is\n",
    "# safe and any other reference to the original frame is left untouched.\n",
    "flow = flow.drop_duplicates(subset=flow.columns.difference(['flow_id']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "7469ed0d-1794-4703-9091-6bc603d8c63f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>flow_id</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>1390</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>1464</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059406</th>\n",
       "      <td>2059407</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>21</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>20848</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059407</th>\n",
       "      <td>2059408</td>\n",
       "      <td>149.171.126.2</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.4</td>\n",
       "      <td>59563</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059408</th>\n",
       "      <td>2059409</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35433</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059410</th>\n",
       "      <td>2059411</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059412</th>\n",
       "      <td>2059413</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>43433</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>33094</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3912748 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         flow_id      source_ip  source_port destination_ip  destination_port  \\\n",
       "0              1     59.166.0.0         1390  149.171.126.6                53   \n",
       "1              2     59.166.0.0        33661  149.171.126.9              1024   \n",
       "2              3     59.166.0.6         1464  149.171.126.7                53   \n",
       "3              4     59.166.0.5         3593  149.171.126.5                53   \n",
       "4              5     59.166.0.3        49664  149.171.126.0                53   \n",
       "...          ...            ...          ...            ...               ...   \n",
       "2059406  2059407  149.171.126.4           21     59.166.0.7             20848   \n",
       "2059407  2059408  149.171.126.2           80     59.166.0.4             59563   \n",
       "2059408  2059409  149.171.126.0           80     59.166.0.9             35433   \n",
       "2059410  2059411  149.171.126.9           80     59.166.0.1             38606   \n",
       "2059412  2059413  149.171.126.7        43433     59.166.0.5             33094   \n",
       "\n",
       "        protocol  \n",
       "0            udp  \n",
       "1            udp  \n",
       "2            udp  \n",
       "3            udp  \n",
       "4            udp  \n",
       "...          ...  \n",
       "2059406      tcp  \n",
       "2059407      tcp  \n",
       "2059408      tcp  \n",
       "2059410      tcp  \n",
       "2059412      tcp  \n",
       "\n",
       "[3912748 rows x 6 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-display `flow`; the previous cell dropped rows duplicated on every column except `flow_id`.\n",
    "flow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "60ec73b2-672a-48c0-aedb-48700d42faea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet_hex</th>\n",
       "      <th>...</th>\n",
       "      <th>DNS labels</th>\n",
       "      <th>DNS inception</th>\n",
       "      <th>DNS keytag</th>\n",
       "      <th>DNS signature</th>\n",
       "      <th>DNS nextname</th>\n",
       "      <th>TFTP Ack block</th>\n",
       "      <th>TFTP_Options oname</th>\n",
       "      <th>TFTP_Options value</th>\n",
       "      <th>TFTP Data block</th>\n",
       "      <th>LDAP present</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>180.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450000b4256940...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000034256840...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000034256840...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000049256b40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000049256b40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999995</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339.0</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003428f240...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999996</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339.0</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003428fe40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999997</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339.0</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500003428fe40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999998</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>43805.0</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>143.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000034865d40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999999</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>18965.0</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>80.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31.0</td>\n",
       "      <td>285.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500011d5f9840...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000000 rows × 397 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                stime      source_ip  source_port destination_ip  \\\n",
       "0        1.424225e+09  149.171.126.1        143.0     59.166.0.9   \n",
       "1        1.424225e+09  149.171.126.1        143.0     59.166.0.9   \n",
       "2        1.424225e+09  149.171.126.1        143.0     59.166.0.9   \n",
       "3        1.424225e+09  149.171.126.1        143.0     59.166.0.9   \n",
       "4        1.424225e+09  149.171.126.1        143.0     59.166.0.9   \n",
       "...               ...            ...          ...            ...   \n",
       "9999995  1.424230e+09     59.166.0.2      32339.0  149.171.126.5   \n",
       "9999996  1.424230e+09     59.166.0.2      32339.0  149.171.126.5   \n",
       "9999997  1.424230e+09     59.166.0.2      32339.0  149.171.126.5   \n",
       "9999998  1.424230e+09     59.166.0.3      43805.0  149.171.126.9   \n",
       "9999999  1.424230e+09     59.166.0.6      18965.0  149.171.126.6   \n",
       "\n",
       "         destination_port protocol  sttl  total_len   first_layer  \\\n",
       "0                 35632.0      tcp  29.0      180.0  cooked linux   \n",
       "1                 35632.0      tcp  30.0       52.0  cooked linux   \n",
       "2                 35632.0      tcp  29.0       52.0  cooked linux   \n",
       "3                 35632.0      tcp  30.0       73.0  cooked linux   \n",
       "4                 35632.0      tcp  29.0       73.0  cooked linux   \n",
       "...                   ...      ...   ...        ...           ...   \n",
       "9999995            6881.0      tcp  32.0       52.0  cooked linux   \n",
       "9999996            6881.0      tcp  32.0       52.0  cooked linux   \n",
       "9999997            6881.0      tcp  31.0       52.0  cooked linux   \n",
       "9999998             143.0      tcp  32.0       52.0  cooked linux   \n",
       "9999999              80.0      tcp  31.0      285.0  cooked linux   \n",
       "\n",
       "                                                packet_hex  ... DNS labels  \\\n",
       "0        000400010006005056a5776300000800450000b4256940...  ...        NaN   \n",
       "1        000000010006001b17059e1c0000080045000034256840...  ...        NaN   \n",
       "2        000400010006005056a577630000080045000034256840...  ...        NaN   \n",
       "3        000000010006001b17059e1c0000080045000049256b40...  ...        NaN   \n",
       "4        000400010006005056a577630000080045000049256b40...  ...        NaN   \n",
       "...                                                    ...  ...        ...   \n",
       "9999995  000000010006021ac5000000000008004500003428f240...  ...        NaN   \n",
       "9999996  000000010006021ac5000000000008004500003428fe40...  ...        NaN   \n",
       "9999997  000400010006005056a524c2000008004500003428fe40...  ...        NaN   \n",
       "9999998  000000010006021ac50000000000080045000034865d40...  ...        NaN   \n",
       "9999999  000400010006005056a524c2000008004500011d5f9840...  ...        NaN   \n",
       "\n",
       "         DNS inception  DNS keytag DNS signature  DNS nextname  \\\n",
       "0                  NaN         NaN           NaN           NaN   \n",
       "1                  NaN         NaN           NaN           NaN   \n",
       "2                  NaN         NaN           NaN           NaN   \n",
       "3                  NaN         NaN           NaN           NaN   \n",
       "4                  NaN         NaN           NaN           NaN   \n",
       "...                ...         ...           ...           ...   \n",
       "9999995            NaN         NaN           NaN           NaN   \n",
       "9999996            NaN         NaN           NaN           NaN   \n",
       "9999997            NaN         NaN           NaN           NaN   \n",
       "9999998            NaN         NaN           NaN           NaN   \n",
       "9999999            NaN         NaN           NaN           NaN   \n",
       "\n",
       "         TFTP Ack block TFTP_Options oname  TFTP_Options value  \\\n",
       "0                   NaN                NaN                 NaN   \n",
       "1                   NaN                NaN                 NaN   \n",
       "2                   NaN                NaN                 NaN   \n",
       "3                   NaN                NaN                 NaN   \n",
       "4                   NaN                NaN                 NaN   \n",
       "...                 ...                ...                 ...   \n",
       "9999995             NaN                NaN                 NaN   \n",
       "9999996             NaN                NaN                 NaN   \n",
       "9999997             NaN                NaN                 NaN   \n",
       "9999998             NaN                NaN                 NaN   \n",
       "9999999             NaN                NaN                 NaN   \n",
       "\n",
       "        TFTP Data block LDAP present  \n",
       "0                   NaN          NaN  \n",
       "1                   NaN          NaN  \n",
       "2                   NaN          NaN  \n",
       "3                   NaN          NaN  \n",
       "4                   NaN          NaN  \n",
       "...                 ...          ...  \n",
       "9999995             NaN          NaN  \n",
       "9999996             NaN          NaN  \n",
       "9999997             NaN          NaN  \n",
       "9999998             NaN          NaN  \n",
       "9999999             NaN          NaN  \n",
       "\n",
       "[10000000 rows x 397 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display `df`, the frame that is left-joined with `flow` a few cells below.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "19ae7493-353c-4fa2-8cdc-a82d1bf73618",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both port columns are displayed as floats above; cast them to int before\n",
    "# they are used as merge keys against `flow`.\n",
    "for port_column in ('source_port', 'destination_port'):\n",
    "    df[port_column] = df[port_column].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "ff0ed3c7-5937-48f1-ba25-c2a7bbe78077",
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_to_match = ['source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "20eae60d-d3fb-4fe0-8e2a-2454d86380d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Left-join `flow` onto `df` using the key columns in `columns_to_match`.\n",
    "# how='left' keeps every row of `df`, including rows with no matching flow\n",
    "# (the columns coming from `flow` are left missing for those rows).\n",
    "# validate='many_to_one' raises pandas.errors.MergeError if `flow` contains\n",
    "# repeated keys, which would otherwise silently duplicate rows of `df`.\n",
    "merged_df = df.merge(flow, on=columns_to_match, how='left', validate='many_to_one')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "acc23f40-dea2-460f-94ab-116992d9809f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet_hex</th>\n",
       "      <th>...</th>\n",
       "      <th>DNS inception</th>\n",
       "      <th>DNS keytag</th>\n",
       "      <th>DNS signature</th>\n",
       "      <th>DNS nextname</th>\n",
       "      <th>TFTP Ack block</th>\n",
       "      <th>TFTP_Options oname</th>\n",
       "      <th>TFTP_Options value</th>\n",
       "      <th>TFTP Data block</th>\n",
       "      <th>LDAP present</th>\n",
       "      <th>flow_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>180.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450000b4256940...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1127213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000034256840...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1127213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000034256840...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1127213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000049256b40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1127213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000049256b40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1127213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999995</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003428f240...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1241519</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999996</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003428fe40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1241519</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999997</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500003428fe40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1241519</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999998</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>43805</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>143</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000034865d40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1241468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999999</th>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>18965</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31.0</td>\n",
       "      <td>285.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500011d5f9840...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1241456</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000000 rows × 398 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                stime      source_ip  source_port destination_ip  \\\n",
       "0        1.424225e+09  149.171.126.1          143     59.166.0.9   \n",
       "1        1.424225e+09  149.171.126.1          143     59.166.0.9   \n",
       "2        1.424225e+09  149.171.126.1          143     59.166.0.9   \n",
       "3        1.424225e+09  149.171.126.1          143     59.166.0.9   \n",
       "4        1.424225e+09  149.171.126.1          143     59.166.0.9   \n",
       "...               ...            ...          ...            ...   \n",
       "9999995  1.424230e+09     59.166.0.2        32339  149.171.126.5   \n",
       "9999996  1.424230e+09     59.166.0.2        32339  149.171.126.5   \n",
       "9999997  1.424230e+09     59.166.0.2        32339  149.171.126.5   \n",
       "9999998  1.424230e+09     59.166.0.3        43805  149.171.126.9   \n",
       "9999999  1.424230e+09     59.166.0.6        18965  149.171.126.6   \n",
       "\n",
       "         destination_port protocol  sttl  total_len   first_layer  \\\n",
       "0                   35632      tcp  29.0      180.0  cooked linux   \n",
       "1                   35632      tcp  30.0       52.0  cooked linux   \n",
       "2                   35632      tcp  29.0       52.0  cooked linux   \n",
       "3                   35632      tcp  30.0       73.0  cooked linux   \n",
       "4                   35632      tcp  29.0       73.0  cooked linux   \n",
       "...                   ...      ...   ...        ...           ...   \n",
       "9999995              6881      tcp  32.0       52.0  cooked linux   \n",
       "9999996              6881      tcp  32.0       52.0  cooked linux   \n",
       "9999997              6881      tcp  31.0       52.0  cooked linux   \n",
       "9999998               143      tcp  32.0       52.0  cooked linux   \n",
       "9999999                80      tcp  31.0      285.0  cooked linux   \n",
       "\n",
       "                                                packet_hex  ... DNS inception  \\\n",
       "0        000400010006005056a5776300000800450000b4256940...  ...           NaN   \n",
       "1        000000010006001b17059e1c0000080045000034256840...  ...           NaN   \n",
       "2        000400010006005056a577630000080045000034256840...  ...           NaN   \n",
       "3        000000010006001b17059e1c0000080045000049256b40...  ...           NaN   \n",
       "4        000400010006005056a577630000080045000049256b40...  ...           NaN   \n",
       "...                                                    ...  ...           ...   \n",
       "9999995  000000010006021ac5000000000008004500003428f240...  ...           NaN   \n",
       "9999996  000000010006021ac5000000000008004500003428fe40...  ...           NaN   \n",
       "9999997  000400010006005056a524c2000008004500003428fe40...  ...           NaN   \n",
       "9999998  000000010006021ac50000000000080045000034865d40...  ...           NaN   \n",
       "9999999  000400010006005056a524c2000008004500011d5f9840...  ...           NaN   \n",
       "\n",
       "         DNS keytag  DNS signature DNS nextname  TFTP Ack block  \\\n",
       "0               NaN            NaN          NaN             NaN   \n",
       "1               NaN            NaN          NaN             NaN   \n",
       "2               NaN            NaN          NaN             NaN   \n",
       "3               NaN            NaN          NaN             NaN   \n",
       "4               NaN            NaN          NaN             NaN   \n",
       "...             ...            ...          ...             ...   \n",
       "9999995         NaN            NaN          NaN             NaN   \n",
       "9999996         NaN            NaN          NaN             NaN   \n",
       "9999997         NaN            NaN          NaN             NaN   \n",
       "9999998         NaN            NaN          NaN             NaN   \n",
       "9999999         NaN            NaN          NaN             NaN   \n",
       "\n",
       "         TFTP_Options oname TFTP_Options value  TFTP Data block LDAP present  \\\n",
       "0                       NaN                NaN              NaN          NaN   \n",
       "1                       NaN                NaN              NaN          NaN   \n",
       "2                       NaN                NaN              NaN          NaN   \n",
       "3                       NaN                NaN              NaN          NaN   \n",
       "4                       NaN                NaN              NaN          NaN   \n",
       "...                     ...                ...              ...          ...   \n",
       "9999995                 NaN                NaN              NaN          NaN   \n",
       "9999996                 NaN                NaN              NaN          NaN   \n",
       "9999997                 NaN                NaN              NaN          NaN   \n",
       "9999998                 NaN                NaN              NaN          NaN   \n",
       "9999999                 NaN                NaN              NaN          NaN   \n",
       "\n",
       "         flow_id  \n",
       "0        1127213  \n",
       "1        1127213  \n",
       "2        1127213  \n",
       "3        1127213  \n",
       "4        1127213  \n",
       "...          ...  \n",
       "9999995  1241519  \n",
       "9999996  1241519  \n",
       "9999997  1241519  \n",
       "9999998  1241468  \n",
       "9999999  1241456  \n",
       "\n",
       "[10000000 rows x 398 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "e5e9e9b8-d64c-4aa4-982f-4b5fe04a10b4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3115979/2896930172.py:2: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
      "  merged_df.insert(0, 'flow_id', flow_id)\n"
     ]
    }
   ],
   "source": [
    "flow_id = merged_df.pop('flow_id')\n",
    "merged_df.insert(0, 'flow_id', flow_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "acae3b63-03ba-4f65-ae1e-29e373a6c3e7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>flow_id</th>\n",
       "      <th>stime</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>...</th>\n",
       "      <th>DNS labels</th>\n",
       "      <th>DNS inception</th>\n",
       "      <th>DNS keytag</th>\n",
       "      <th>DNS signature</th>\n",
       "      <th>DNS nextname</th>\n",
       "      <th>TFTP Ack block</th>\n",
       "      <th>TFTP_Options oname</th>\n",
       "      <th>TFTP_Options value</th>\n",
       "      <th>TFTP Data block</th>\n",
       "      <th>LDAP present</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1127213</td>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>180.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1127213</td>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1127213</td>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1127213</td>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1127213</td>\n",
       "      <td>1.424225e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>143</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35632</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999995</th>\n",
       "      <td>1241519</td>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999996</th>\n",
       "      <td>1241519</td>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999997</th>\n",
       "      <td>1241519</td>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>32339</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999998</th>\n",
       "      <td>1241468</td>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>43805</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>143</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999999</th>\n",
       "      <td>1241456</td>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>18965</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31.0</td>\n",
       "      <td>285.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000000 rows × 398 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         flow_id         stime      source_ip  source_port destination_ip  \\\n",
       "0        1127213  1.424225e+09  149.171.126.1          143     59.166.0.9   \n",
       "1        1127213  1.424225e+09  149.171.126.1          143     59.166.0.9   \n",
       "2        1127213  1.424225e+09  149.171.126.1          143     59.166.0.9   \n",
       "3        1127213  1.424225e+09  149.171.126.1          143     59.166.0.9   \n",
       "4        1127213  1.424225e+09  149.171.126.1          143     59.166.0.9   \n",
       "...          ...           ...            ...          ...            ...   \n",
       "9999995  1241519  1.424230e+09     59.166.0.2        32339  149.171.126.5   \n",
       "9999996  1241519  1.424230e+09     59.166.0.2        32339  149.171.126.5   \n",
       "9999997  1241519  1.424230e+09     59.166.0.2        32339  149.171.126.5   \n",
       "9999998  1241468  1.424230e+09     59.166.0.3        43805  149.171.126.9   \n",
       "9999999  1241456  1.424230e+09     59.166.0.6        18965  149.171.126.6   \n",
       "\n",
       "         destination_port protocol  sttl  total_len   first_layer  ...  \\\n",
       "0                   35632      tcp  29.0      180.0  cooked linux  ...   \n",
       "1                   35632      tcp  30.0       52.0  cooked linux  ...   \n",
       "2                   35632      tcp  29.0       52.0  cooked linux  ...   \n",
       "3                   35632      tcp  30.0       73.0  cooked linux  ...   \n",
       "4                   35632      tcp  29.0       73.0  cooked linux  ...   \n",
       "...                   ...      ...   ...        ...           ...  ...   \n",
       "9999995              6881      tcp  32.0       52.0  cooked linux  ...   \n",
       "9999996              6881      tcp  32.0       52.0  cooked linux  ...   \n",
       "9999997              6881      tcp  31.0       52.0  cooked linux  ...   \n",
       "9999998               143      tcp  32.0       52.0  cooked linux  ...   \n",
       "9999999                80      tcp  31.0      285.0  cooked linux  ...   \n",
       "\n",
       "        DNS labels DNS inception  DNS keytag  DNS signature DNS nextname  \\\n",
       "0              NaN           NaN         NaN            NaN          NaN   \n",
       "1              NaN           NaN         NaN            NaN          NaN   \n",
       "2              NaN           NaN         NaN            NaN          NaN   \n",
       "3              NaN           NaN         NaN            NaN          NaN   \n",
       "4              NaN           NaN         NaN            NaN          NaN   \n",
       "...            ...           ...         ...            ...          ...   \n",
       "9999995        NaN           NaN         NaN            NaN          NaN   \n",
       "9999996        NaN           NaN         NaN            NaN          NaN   \n",
       "9999997        NaN           NaN         NaN            NaN          NaN   \n",
       "9999998        NaN           NaN         NaN            NaN          NaN   \n",
       "9999999        NaN           NaN         NaN            NaN          NaN   \n",
       "\n",
       "         TFTP Ack block  TFTP_Options oname TFTP_Options value  \\\n",
       "0                   NaN                 NaN                NaN   \n",
       "1                   NaN                 NaN                NaN   \n",
       "2                   NaN                 NaN                NaN   \n",
       "3                   NaN                 NaN                NaN   \n",
       "4                   NaN                 NaN                NaN   \n",
       "...                 ...                 ...                ...   \n",
       "9999995             NaN                 NaN                NaN   \n",
       "9999996             NaN                 NaN                NaN   \n",
       "9999997             NaN                 NaN                NaN   \n",
       "9999998             NaN                 NaN                NaN   \n",
       "9999999             NaN                 NaN                NaN   \n",
       "\n",
       "         TFTP Data block LDAP present  \n",
       "0                    NaN          NaN  \n",
       "1                    NaN          NaN  \n",
       "2                    NaN          NaN  \n",
       "3                    NaN          NaN  \n",
       "4                    NaN          NaN  \n",
       "...                  ...          ...  \n",
       "9999995              NaN          NaN  \n",
       "9999996              NaN          NaN  \n",
       "9999997              NaN          NaN  \n",
       "9999998              NaN          NaN  \n",
       "9999999              NaN          NaN  \n",
       "\n",
       "[10000000 rows x 398 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07d3921f-3c6a-4877-acd6-223a80b63c35",
   "metadata": {},
   "source": [
    "# Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "25aac04e-1ba9-4ce0-90c9-ee360b0d8db4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_df.flow_id.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 271,
   "id": "586eced8-7534-4ec0-9ea2-f87286605479",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "stime                                1421927414\n",
       "source_ip                           10.40.182.3\n",
       "source_port                                   0\n",
       "destination_ip                      10.40.182.1\n",
       "destination_port                              0\n",
       "                                       ...     \n",
       "HSRP MD5 Authentication sourceip            NaN\n",
       "HSRP MD5 Authentication keyid               NaN\n",
       "SCTPChunkInit addr                          NaN\n",
       "flow_id                                148629.0\n",
       "protocol_y                                  arp\n",
       "Name: 13, Length: 389, dtype: object"
      ]
     },
     "execution_count": 271,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_df.iloc[13]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "id": "fdba8adc-d5eb-4c80-9eec-d02f09a272e2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>flow_id</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [flow_id, source_ip, source_port, destination_ip, destination_port, protocol]\n",
       "Index: []"
      ]
     },
     "execution_count": 254,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "flow[flow.protocol=='others']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 272,
   "id": "3c36a765-96e3-4b67-a3e2-c59065081a5f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>flow_id</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1966462</th>\n",
       "      <td>521</td>\n",
       "      <td>10.40.182.3</td>\n",
       "      <td>0</td>\n",
       "      <td>10.40.182.1</td>\n",
       "      <td>0</td>\n",
       "      <td>others</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         flow_id    source_ip  source_port destination_ip  destination_port  \\\n",
       "1966462      521  10.40.182.3            0    10.40.182.1                 0   \n",
       "\n",
       "        protocol  \n",
       "1966462   others  "
      ]
     },
     "execution_count": 272,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "flow[(flow['source_ip']=='10.40.182.3') & (flow['destination_ip']=='10.40.182.1') & (flow['source_port']==0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "id": "0ed49900-c32b-4183-8f3a-e00a28d29984",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>flow_id</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>111536</th>\n",
       "      <td>115902</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>6103</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>52633</td>\n",
       "      <td>tcp</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        flow_id   source_ip source_port destination_ip destination_port  \\\n",
       "111536   115902  59.166.0.3        6103  149.171.126.4            52633   \n",
       "\n",
       "       protocol  \n",
       "111536      tcp  "
      ]
     },
     "execution_count": 212,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "flow[(flow['source_ip']=='59.166.0.3') & (flow['destination_ip']=='149.171.126.4') & (flow['source_port']==6103) & (flow['destination_port']=='52633')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "id": "7ffb1cd4-bb8f-4a3c-aa32-2b7b9170e375",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NaN         9935521\n",
       "664410.0       1904\n",
       "791440.0       1312\n",
       "806229.0       1308\n",
       "752940.0       1308\n",
       "             ...   \n",
       "346583.0          4\n",
       "723549.0          4\n",
       "821934.0          2\n",
       "189265.0          2\n",
       "149675.0          2\n",
       "Name: flow_id, Length: 847, dtype: int64"
      ]
     },
     "execution_count": 175,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_df.flow_id.value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "d71ef1d6-0933-4c1a-bb69-e6a0a5473252",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_GPU=1\n",
      "[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_CUDA=1\n",
      "[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_GPU=1\n",
      "[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_CUDA=1\n",
      "[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_GPU=1\n",
      "[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_CUDA=1\n",
      "[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_GPU=1\n",
      "[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_CUDA=1\n",
      "[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_GPU=1\n",
      "[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_CUDA=1\n",
      "[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_GPU=1\n",
      "[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_CUDA=1\n",
      "[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_GPU=1\n",
      "[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_CUDA=1\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4baf70>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6b7db0040>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6b7db0040>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6bc88d280>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6b7db0040>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6b7db0040>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6b7db0040>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6b7db0040>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6b7db0790>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6bc88d280>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4baf70>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6bc88d280>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4baf70>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6bc88d280>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4baf70>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe6bc88d280>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4baf70>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4baf70>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4baee0>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4baf70>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4bab80>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4baaf0>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4ba310>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4baaf0>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4bab80>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4baaf0>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n",
      "Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe5fa4bad30>\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n",
      "    self._make_module_from_path(filepath)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n",
      "    module = module_class(filepath, prefix, user_api, internal_api)\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 606, in __init__\n",
      "    self.version = self.get_version()\n",
      "  File \"/opt/jupyter-hub/lib/python3.9/site-packages/threadpoolctl.py\", line 646, in get_version\n",
      "    config = get_config().split()\n",
      "AttributeError: 'NoneType' object has no attribute 'split'\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_bc86a_row11_col1, #T_bc86a_row17_col1, #T_bc86a_row19_col1, #T_bc86a_row21_col1, #T_bc86a_row28_col1 {\n",
       "  background-color: lightgreen;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_bc86a\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_bc86a_level0_col0\" class=\"col_heading level0 col0\" >Description</th>\n",
       "      <th id=\"T_bc86a_level0_col1\" class=\"col_heading level0 col1\" >Value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
       "      <td id=\"T_bc86a_row0_col0\" class=\"data row0 col0\" >Session id</td>\n",
       "      <td id=\"T_bc86a_row0_col1\" class=\"data row0 col1\" >123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
       "      <td id=\"T_bc86a_row1_col0\" class=\"data row1 col0\" >Target</td>\n",
       "      <td id=\"T_bc86a_row1_col1\" class=\"data row1 col1\" >attack_label</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
       "      <td id=\"T_bc86a_row2_col0\" class=\"data row2 col0\" >Target type</td>\n",
       "      <td id=\"T_bc86a_row2_col1\" class=\"data row2 col1\" >Multiclass</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
       "      <td id=\"T_bc86a_row3_col0\" class=\"data row3 col0\" >Target mapping</td>\n",
       "      <td id=\"T_bc86a_row3_col1\" class=\"data row3 col1\" >analysis: 0, backdoor: 1, dos: 2, exploits: 3, fuzzers: 4, generic: 5, normal: 6, reconnaissance: 7, shellcode: 8, worms: 9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
       "      <td id=\"T_bc86a_row4_col0\" class=\"data row4 col0\" >Original data shape</td>\n",
       "      <td id=\"T_bc86a_row4_col1\" class=\"data row4 col1\" >(2059415, 50)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row5\" class=\"row_heading level0 row5\" >5</th>\n",
       "      <td id=\"T_bc86a_row5_col0\" class=\"data row5 col0\" >Transformed data shape</td>\n",
       "      <td id=\"T_bc86a_row5_col1\" class=\"data row5 col1\" >(14336225, 10)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row6\" class=\"row_heading level0 row6\" >6</th>\n",
       "      <td id=\"T_bc86a_row6_col0\" class=\"data row6 col0\" >Transformed train set shape</td>\n",
       "      <td id=\"T_bc86a_row6_col1\" class=\"data row6 col1\" >(13718400, 10)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row7\" class=\"row_heading level0 row7\" >7</th>\n",
       "      <td id=\"T_bc86a_row7_col0\" class=\"data row7 col0\" >Transformed test set shape</td>\n",
       "      <td id=\"T_bc86a_row7_col1\" class=\"data row7 col1\" >(617825, 10)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row8\" class=\"row_heading level0 row8\" >8</th>\n",
       "      <td id=\"T_bc86a_row8_col0\" class=\"data row8 col0\" >Numeric features</td>\n",
       "      <td id=\"T_bc86a_row8_col1\" class=\"data row8 col1\" >44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row9\" class=\"row_heading level0 row9\" >9</th>\n",
       "      <td id=\"T_bc86a_row9_col0\" class=\"data row9 col0\" >Categorical features</td>\n",
       "      <td id=\"T_bc86a_row9_col1\" class=\"data row9 col1\" >5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row10\" class=\"row_heading level0 row10\" >10</th>\n",
       "      <td id=\"T_bc86a_row10_col0\" class=\"data row10 col0\" >Rows with missing values</td>\n",
       "      <td id=\"T_bc86a_row10_col1\" class=\"data row10 col1\" >50.3%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row11\" class=\"row_heading level0 row11\" >11</th>\n",
       "      <td id=\"T_bc86a_row11_col0\" class=\"data row11 col0\" >Preprocess</td>\n",
       "      <td id=\"T_bc86a_row11_col1\" class=\"data row11 col1\" >True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row12\" class=\"row_heading level0 row12\" >12</th>\n",
       "      <td id=\"T_bc86a_row12_col0\" class=\"data row12 col0\" >Imputation type</td>\n",
       "      <td id=\"T_bc86a_row12_col1\" class=\"data row12 col1\" >simple</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row13\" class=\"row_heading level0 row13\" >13</th>\n",
       "      <td id=\"T_bc86a_row13_col0\" class=\"data row13 col0\" >Numeric imputation</td>\n",
       "      <td id=\"T_bc86a_row13_col1\" class=\"data row13 col1\" >mean</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row14\" class=\"row_heading level0 row14\" >14</th>\n",
       "      <td id=\"T_bc86a_row14_col0\" class=\"data row14 col0\" >Categorical imputation</td>\n",
       "      <td id=\"T_bc86a_row14_col1\" class=\"data row14 col1\" >mode</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row15\" class=\"row_heading level0 row15\" >15</th>\n",
       "      <td id=\"T_bc86a_row15_col0\" class=\"data row15 col0\" >Maximum one-hot encoding</td>\n",
       "      <td id=\"T_bc86a_row15_col1\" class=\"data row15 col1\" >25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row16\" class=\"row_heading level0 row16\" >16</th>\n",
       "      <td id=\"T_bc86a_row16_col0\" class=\"data row16 col0\" >Encoding method</td>\n",
       "      <td id=\"T_bc86a_row16_col1\" class=\"data row16 col1\" >None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row17\" class=\"row_heading level0 row17\" >17</th>\n",
       "      <td id=\"T_bc86a_row17_col0\" class=\"data row17 col0\" >Fix imbalance</td>\n",
       "      <td id=\"T_bc86a_row17_col1\" class=\"data row17 col1\" >True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row18\" class=\"row_heading level0 row18\" >18</th>\n",
       "      <td id=\"T_bc86a_row18_col0\" class=\"data row18 col0\" >Fix imbalance method</td>\n",
       "      <td id=\"T_bc86a_row18_col1\" class=\"data row18 col1\" >SMOTE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row19\" class=\"row_heading level0 row19\" >19</th>\n",
       "      <td id=\"T_bc86a_row19_col0\" class=\"data row19 col0\" >Normalize</td>\n",
       "      <td id=\"T_bc86a_row19_col1\" class=\"data row19 col1\" >True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row20\" class=\"row_heading level0 row20\" >20</th>\n",
       "      <td id=\"T_bc86a_row20_col0\" class=\"data row20 col0\" >Normalize method</td>\n",
       "      <td id=\"T_bc86a_row20_col1\" class=\"data row20 col1\" >zscore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row21\" class=\"row_heading level0 row21\" >21</th>\n",
       "      <td id=\"T_bc86a_row21_col0\" class=\"data row21 col0\" >Feature selection</td>\n",
       "      <td id=\"T_bc86a_row21_col1\" class=\"data row21 col1\" >True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row22\" class=\"row_heading level0 row22\" >22</th>\n",
       "      <td id=\"T_bc86a_row22_col0\" class=\"data row22 col0\" >Feature selection method</td>\n",
       "      <td id=\"T_bc86a_row22_col1\" class=\"data row22 col1\" >classic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row23\" class=\"row_heading level0 row23\" >23</th>\n",
       "      <td id=\"T_bc86a_row23_col0\" class=\"data row23 col0\" >Feature selection estimator</td>\n",
       "      <td id=\"T_bc86a_row23_col1\" class=\"data row23 col1\" >lightgbm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row24\" class=\"row_heading level0 row24\" >24</th>\n",
       "      <td id=\"T_bc86a_row24_col0\" class=\"data row24 col0\" >Number of features selected</td>\n",
       "      <td id=\"T_bc86a_row24_col1\" class=\"data row24 col1\" >0.200000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row25\" class=\"row_heading level0 row25\" >25</th>\n",
       "      <td id=\"T_bc86a_row25_col0\" class=\"data row25 col0\" >Fold Generator</td>\n",
       "      <td id=\"T_bc86a_row25_col1\" class=\"data row25 col1\" >StratifiedKFold</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row26\" class=\"row_heading level0 row26\" >26</th>\n",
       "      <td id=\"T_bc86a_row26_col0\" class=\"data row26 col0\" >Fold Number</td>\n",
       "      <td id=\"T_bc86a_row26_col1\" class=\"data row26 col1\" >10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row27\" class=\"row_heading level0 row27\" >27</th>\n",
       "      <td id=\"T_bc86a_row27_col0\" class=\"data row27 col0\" >CPU Jobs</td>\n",
       "      <td id=\"T_bc86a_row27_col1\" class=\"data row27 col1\" >-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row28\" class=\"row_heading level0 row28\" >28</th>\n",
       "      <td id=\"T_bc86a_row28_col0\" class=\"data row28 col0\" >Use GPU</td>\n",
       "      <td id=\"T_bc86a_row28_col1\" class=\"data row28 col1\" >True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row29\" class=\"row_heading level0 row29\" >29</th>\n",
       "      <td id=\"T_bc86a_row29_col0\" class=\"data row29 col0\" >Log Experiment</td>\n",
       "      <td id=\"T_bc86a_row29_col1\" class=\"data row29 col1\" >False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row30\" class=\"row_heading level0 row30\" >30</th>\n",
       "      <td id=\"T_bc86a_row30_col0\" class=\"data row30 col0\" >Experiment Name</td>\n",
       "      <td id=\"T_bc86a_row30_col1\" class=\"data row30 col1\" >clf-default-name</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_bc86a_level0_row31\" class=\"row_heading level0 row31\" >31</th>\n",
       "      <td id=\"T_bc86a_row31_col0\" class=\"data row31 col0\" >USI</td>\n",
       "      <td id=\"T_bc86a_row31_col1\" class=\"data row31 col1\" >9e62</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x7fe6c6f72ee0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_GPU=1\n",
      "[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_CUDA=1\n",
      "[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_GPU=1\n",
      "[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.\n",
      "Please recompile with CMake option -DUSE_CUDA=1\n"
     ]
    }
   ],
   "source": [
    "from pycaret.classification import *\n",
    "s = setup(flow1, target = 'attack_label', normalize = True, fix_imbalance=True, use_gpu=True, session_id = 123)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "ae870e57-a2a6-4323-8caf-747595410d54",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_2ddd7_row10_col0, #T_2ddd7_row10_col1, #T_2ddd7_row10_col2, #T_2ddd7_row10_col3, #T_2ddd7_row10_col4, #T_2ddd7_row10_col5, #T_2ddd7_row10_col6 {\n",
       "  background: yellow;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_2ddd7\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_2ddd7_level0_col0\" class=\"col_heading level0 col0\" >Accuracy</th>\n",
       "      <th id=\"T_2ddd7_level0_col1\" class=\"col_heading level0 col1\" >AUC</th>\n",
       "      <th id=\"T_2ddd7_level0_col2\" class=\"col_heading level0 col2\" >Recall</th>\n",
       "      <th id=\"T_2ddd7_level0_col3\" class=\"col_heading level0 col3\" >Prec.</th>\n",
       "      <th id=\"T_2ddd7_level0_col4\" class=\"col_heading level0 col4\" >F1</th>\n",
       "      <th id=\"T_2ddd7_level0_col5\" class=\"col_heading level0 col5\" >Kappa</th>\n",
       "      <th id=\"T_2ddd7_level0_col6\" class=\"col_heading level0 col6\" >MCC</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th class=\"index_name level0\" >Fold</th>\n",
       "      <th class=\"blank col0\" >&nbsp;</th>\n",
       "      <th class=\"blank col1\" >&nbsp;</th>\n",
       "      <th class=\"blank col2\" >&nbsp;</th>\n",
       "      <th class=\"blank col3\" >&nbsp;</th>\n",
       "      <th class=\"blank col4\" >&nbsp;</th>\n",
       "      <th class=\"blank col5\" >&nbsp;</th>\n",
       "      <th class=\"blank col6\" >&nbsp;</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
       "      <td id=\"T_2ddd7_row0_col0\" class=\"data row0 col0\" >0.9813</td>\n",
       "      <td id=\"T_2ddd7_row0_col1\" class=\"data row0 col1\" >0.9985</td>\n",
       "      <td id=\"T_2ddd7_row0_col2\" class=\"data row0 col2\" >0.9813</td>\n",
       "      <td id=\"T_2ddd7_row0_col3\" class=\"data row0 col3\" >0.9840</td>\n",
       "      <td id=\"T_2ddd7_row0_col4\" class=\"data row0 col4\" >0.9824</td>\n",
       "      <td id=\"T_2ddd7_row0_col5\" class=\"data row0 col5\" >0.8056</td>\n",
       "      <td id=\"T_2ddd7_row0_col6\" class=\"data row0 col6\" >0.8060</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
       "      <td id=\"T_2ddd7_row1_col0\" class=\"data row1 col0\" >0.9817</td>\n",
       "      <td id=\"T_2ddd7_row1_col1\" class=\"data row1 col1\" >0.9986</td>\n",
       "      <td id=\"T_2ddd7_row1_col2\" class=\"data row1 col2\" >0.9817</td>\n",
       "      <td id=\"T_2ddd7_row1_col3\" class=\"data row1 col3\" >0.9844</td>\n",
       "      <td id=\"T_2ddd7_row1_col4\" class=\"data row1 col4\" >0.9828</td>\n",
       "      <td id=\"T_2ddd7_row1_col5\" class=\"data row1 col5\" >0.8095</td>\n",
       "      <td id=\"T_2ddd7_row1_col6\" class=\"data row1 col6\" >0.8098</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
       "      <td id=\"T_2ddd7_row2_col0\" class=\"data row2 col0\" >0.9820</td>\n",
       "      <td id=\"T_2ddd7_row2_col1\" class=\"data row2 col1\" >0.9984</td>\n",
       "      <td id=\"T_2ddd7_row2_col2\" class=\"data row2 col2\" >0.9820</td>\n",
       "      <td id=\"T_2ddd7_row2_col3\" class=\"data row2 col3\" >0.9845</td>\n",
       "      <td id=\"T_2ddd7_row2_col4\" class=\"data row2 col4\" >0.9831</td>\n",
       "      <td id=\"T_2ddd7_row2_col5\" class=\"data row2 col5\" >0.8138</td>\n",
       "      <td id=\"T_2ddd7_row2_col6\" class=\"data row2 col6\" >0.8141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
       "      <td id=\"T_2ddd7_row3_col0\" class=\"data row3 col0\" >0.9820</td>\n",
       "      <td id=\"T_2ddd7_row3_col1\" class=\"data row3 col1\" >0.9983</td>\n",
       "      <td id=\"T_2ddd7_row3_col2\" class=\"data row3 col2\" >0.9820</td>\n",
       "      <td id=\"T_2ddd7_row3_col3\" class=\"data row3 col3\" >0.9846</td>\n",
       "      <td id=\"T_2ddd7_row3_col4\" class=\"data row3 col4\" >0.9831</td>\n",
       "      <td id=\"T_2ddd7_row3_col5\" class=\"data row3 col5\" >0.8131</td>\n",
       "      <td id=\"T_2ddd7_row3_col6\" class=\"data row3 col6\" >0.8134</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
       "      <td id=\"T_2ddd7_row4_col0\" class=\"data row4 col0\" >0.9815</td>\n",
       "      <td id=\"T_2ddd7_row4_col1\" class=\"data row4 col1\" >0.9985</td>\n",
       "      <td id=\"T_2ddd7_row4_col2\" class=\"data row4 col2\" >0.9815</td>\n",
       "      <td id=\"T_2ddd7_row4_col3\" class=\"data row4 col3\" >0.9841</td>\n",
       "      <td id=\"T_2ddd7_row4_col4\" class=\"data row4 col4\" >0.9827</td>\n",
       "      <td id=\"T_2ddd7_row4_col5\" class=\"data row4 col5\" >0.8082</td>\n",
       "      <td id=\"T_2ddd7_row4_col6\" class=\"data row4 col6\" >0.8085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row5\" class=\"row_heading level0 row5\" >5</th>\n",
       "      <td id=\"T_2ddd7_row5_col0\" class=\"data row5 col0\" >0.9816</td>\n",
       "      <td id=\"T_2ddd7_row5_col1\" class=\"data row5 col1\" >0.9985</td>\n",
       "      <td id=\"T_2ddd7_row5_col2\" class=\"data row5 col2\" >0.9816</td>\n",
       "      <td id=\"T_2ddd7_row5_col3\" class=\"data row5 col3\" >0.9840</td>\n",
       "      <td id=\"T_2ddd7_row5_col4\" class=\"data row5 col4\" >0.9827</td>\n",
       "      <td id=\"T_2ddd7_row5_col5\" class=\"data row5 col5\" >0.8084</td>\n",
       "      <td id=\"T_2ddd7_row5_col6\" class=\"data row5 col6\" >0.8086</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row6\" class=\"row_heading level0 row6\" >6</th>\n",
       "      <td id=\"T_2ddd7_row6_col0\" class=\"data row6 col0\" >0.9814</td>\n",
       "      <td id=\"T_2ddd7_row6_col1\" class=\"data row6 col1\" >0.9985</td>\n",
       "      <td id=\"T_2ddd7_row6_col2\" class=\"data row6 col2\" >0.9814</td>\n",
       "      <td id=\"T_2ddd7_row6_col3\" class=\"data row6 col3\" >0.9840</td>\n",
       "      <td id=\"T_2ddd7_row6_col4\" class=\"data row6 col4\" >0.9825</td>\n",
       "      <td id=\"T_2ddd7_row6_col5\" class=\"data row6 col5\" >0.8074</td>\n",
       "      <td id=\"T_2ddd7_row6_col6\" class=\"data row6 col6\" >0.8077</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row7\" class=\"row_heading level0 row7\" >7</th>\n",
       "      <td id=\"T_2ddd7_row7_col0\" class=\"data row7 col0\" >0.9809</td>\n",
       "      <td id=\"T_2ddd7_row7_col1\" class=\"data row7 col1\" >0.9985</td>\n",
       "      <td id=\"T_2ddd7_row7_col2\" class=\"data row7 col2\" >0.9809</td>\n",
       "      <td id=\"T_2ddd7_row7_col3\" class=\"data row7 col3\" >0.9837</td>\n",
       "      <td id=\"T_2ddd7_row7_col4\" class=\"data row7 col4\" >0.9822</td>\n",
       "      <td id=\"T_2ddd7_row7_col5\" class=\"data row7 col5\" >0.8027</td>\n",
       "      <td id=\"T_2ddd7_row7_col6\" class=\"data row7 col6\" >0.8031</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row8\" class=\"row_heading level0 row8\" >8</th>\n",
       "      <td id=\"T_2ddd7_row8_col0\" class=\"data row8 col0\" >0.9805</td>\n",
       "      <td id=\"T_2ddd7_row8_col1\" class=\"data row8 col1\" >0.9987</td>\n",
       "      <td id=\"T_2ddd7_row8_col2\" class=\"data row8 col2\" >0.9805</td>\n",
       "      <td id=\"T_2ddd7_row8_col3\" class=\"data row8 col3\" >0.9838</td>\n",
       "      <td id=\"T_2ddd7_row8_col4\" class=\"data row8 col4\" >0.9819</td>\n",
       "      <td id=\"T_2ddd7_row8_col5\" class=\"data row8 col5\" >0.8006</td>\n",
       "      <td id=\"T_2ddd7_row8_col6\" class=\"data row8 col6\" >0.8013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row9\" class=\"row_heading level0 row9\" >9</th>\n",
       "      <td id=\"T_2ddd7_row9_col0\" class=\"data row9 col0\" >0.9815</td>\n",
       "      <td id=\"T_2ddd7_row9_col1\" class=\"data row9 col1\" >0.9986</td>\n",
       "      <td id=\"T_2ddd7_row9_col2\" class=\"data row9 col2\" >0.9815</td>\n",
       "      <td id=\"T_2ddd7_row9_col3\" class=\"data row9 col3\" >0.9840</td>\n",
       "      <td id=\"T_2ddd7_row9_col4\" class=\"data row9 col4\" >0.9825</td>\n",
       "      <td id=\"T_2ddd7_row9_col5\" class=\"data row9 col5\" >0.8074</td>\n",
       "      <td id=\"T_2ddd7_row9_col6\" class=\"data row9 col6\" >0.8076</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row10\" class=\"row_heading level0 row10\" >Mean</th>\n",
       "      <td id=\"T_2ddd7_row10_col0\" class=\"data row10 col0\" >0.9815</td>\n",
       "      <td id=\"T_2ddd7_row10_col1\" class=\"data row10 col1\" >0.9985</td>\n",
       "      <td id=\"T_2ddd7_row10_col2\" class=\"data row10 col2\" >0.9815</td>\n",
       "      <td id=\"T_2ddd7_row10_col3\" class=\"data row10 col3\" >0.9841</td>\n",
       "      <td id=\"T_2ddd7_row10_col4\" class=\"data row10 col4\" >0.9826</td>\n",
       "      <td id=\"T_2ddd7_row10_col5\" class=\"data row10 col5\" >0.8077</td>\n",
       "      <td id=\"T_2ddd7_row10_col6\" class=\"data row10 col6\" >0.8080</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_2ddd7_level0_row11\" class=\"row_heading level0 row11\" >Std</th>\n",
       "      <td id=\"T_2ddd7_row11_col0\" class=\"data row11 col0\" >0.0004</td>\n",
       "      <td id=\"T_2ddd7_row11_col1\" class=\"data row11 col1\" >0.0001</td>\n",
       "      <td id=\"T_2ddd7_row11_col2\" class=\"data row11 col2\" >0.0004</td>\n",
       "      <td id=\"T_2ddd7_row11_col3\" class=\"data row11 col3\" >0.0003</td>\n",
       "      <td id=\"T_2ddd7_row11_col4\" class=\"data row11 col4\" >0.0004</td>\n",
       "      <td id=\"T_2ddd7_row11_col5\" class=\"data row11 col5\" >0.0039</td>\n",
       "      <td id=\"T_2ddd7_row11_col6\" class=\"data row11 col6\" >0.0038</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x7fe6c6f53040>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "rf = create_model('rf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "14f64d29-e32d-47ad-8bc8-7afd8df7506f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cd7cbc53df0146709956c9ffe34f3fa8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "evaluate_model(rf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "ab6a6ea0-1017-4dea-8581-05e67f6a5031",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.23158914, 0.1070963 , 0.17587187, 0.05399162, 0.04281908,\n",
       "       0.03992106, 0.09035387, 0.13516337, 0.12319368])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rf.feature_importances_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bb2e07ad-2c72-4eff-a0d1-1fa90867e5ee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>flow_id</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "      <th>state</th>\n",
       "      <th>dur</th>\n",
       "      <th>sbytes</th>\n",
       "      <th>dbytes</th>\n",
       "      <th>...</th>\n",
       "      <th>ct_ftp_cmd</th>\n",
       "      <th>ct_srv_src</th>\n",
       "      <th>ct_srv_dst</th>\n",
       "      <th>ct_dst_ltm</th>\n",
       "      <th>ct_src_ltm</th>\n",
       "      <th>ct_src_dport_ltm</th>\n",
       "      <th>ct_dst_sport_ltm</th>\n",
       "      <th>ct_dst_src_ltm</th>\n",
       "      <th>attack_label</th>\n",
       "      <th>binary_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>1390</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001055</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.036133</td>\n",
       "      <td>528</td>\n",
       "      <td>304</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>1464</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001119</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001209</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.001169</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059410</th>\n",
       "      <td>2059411</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059411</th>\n",
       "      <td>2059412</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059412</th>\n",
       "      <td>2059413</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>33094</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>43433</td>\n",
       "      <td>tcp</td>\n",
       "      <td>FIN</td>\n",
       "      <td>0.087306</td>\n",
       "      <td>320</td>\n",
       "      <td>1828</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059413</th>\n",
       "      <td>2059414</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35433</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>2.200934</td>\n",
       "      <td>3498</td>\n",
       "      <td>166054</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059414</th>\n",
       "      <td>2059415</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>17293</td>\n",
       "      <td>149.171.126.17</td>\n",
       "      <td>110</td>\n",
       "      <td>tcp</td>\n",
       "      <td>CON</td>\n",
       "      <td>0.942984</td>\n",
       "      <td>574</td>\n",
       "      <td>676</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>exploits</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2059415 rows × 50 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         flow_id     source_ip  source_port  destination_ip  destination_port  \\\n",
       "0              1    59.166.0.0         1390   149.171.126.6                53   \n",
       "1              2    59.166.0.0        33661   149.171.126.9              1024   \n",
       "2              3    59.166.0.6         1464   149.171.126.7                53   \n",
       "3              4    59.166.0.5         3593   149.171.126.5                53   \n",
       "4              5    59.166.0.3        49664   149.171.126.0                53   \n",
       "...          ...           ...          ...             ...               ...   \n",
       "2059410  2059411    59.166.0.1        38606   149.171.126.9                80   \n",
       "2059411  2059412    59.166.0.1        38606   149.171.126.9                80   \n",
       "2059412  2059413    59.166.0.5        33094   149.171.126.7             43433   \n",
       "2059413  2059414    59.166.0.9        35433   149.171.126.0                80   \n",
       "2059414  2059415  175.45.176.0        17293  149.171.126.17               110   \n",
       "\n",
       "        protocol state       dur  sbytes  dbytes  ...  ct_ftp_cmd  ct_srv_src  \\\n",
       "0            udp   CON  0.001055     132     164  ...           0           3   \n",
       "1            udp   CON  0.036133     528     304  ...           0           2   \n",
       "2            udp   CON  0.001119     146     178  ...           0          12   \n",
       "3            udp   CON  0.001209     132     164  ...           0           6   \n",
       "4            udp   CON  0.001169     146     178  ...           0           7   \n",
       "...          ...   ...       ...     ...     ...  ...         ...         ...   \n",
       "2059410      tcp   CON  0.564998   14106  772406  ...           0           1   \n",
       "2059411      tcp   CON  0.564998   14106  772406  ...           0           2   \n",
       "2059412      tcp   FIN  0.087306     320    1828  ...           0           1   \n",
       "2059413      tcp   CON  2.200934    3498  166054  ...           0           1   \n",
       "2059414      tcp   CON  0.942984     574     676  ...           0           1   \n",
       "\n",
       "         ct_srv_dst  ct_dst_ltm ct_src_ltm  ct_src_dport_ltm  \\\n",
       "0                 7           1          3                 1   \n",
       "1                 4           2          3                 1   \n",
       "2                 8           1          2                 2   \n",
       "3                 9           1          1                 1   \n",
       "4                 9           1          1                 1   \n",
       "...             ...         ...        ...               ...   \n",
       "2059410           1           4          2                 2   \n",
       "2059411           1           4          2                 2   \n",
       "2059412           2           3          3                 1   \n",
       "2059413           1           2          4                 2   \n",
       "2059414           1           2          4                 2   \n",
       "\n",
       "         ct_dst_sport_ltm  ct_dst_src_ltm  attack_label  binary_label  \n",
       "0                       1               1        normal             0  \n",
       "1                       1               2        normal             0  \n",
       "2                       1               1        normal             0  \n",
       "3                       1               1        normal             0  \n",
       "4                       1               1        normal             0  \n",
       "...                   ...             ...           ...           ...  \n",
       "2059410                 2               2        normal             0  \n",
       "2059411                 2               2        normal             0  \n",
       "2059412                 1               3        normal             0  \n",
       "2059413                 2               2        normal             0  \n",
       "2059414                 2               2      exploits             1  \n",
       "\n",
       "[2059415 rows x 50 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the flow table with all 50 columns before it is narrowed down.\n",
    "flow1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e9a50d9d-ed80-4df3-909c-506d90b0b110",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Narrow flow1 down to the 13 columns that are kept (the rest are dropped).\n",
    "flow_columns = [\n",
    "    'source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol',\n",
    "    'dur', 'sbytes', 'dbytes', 'sload', 'dload', 'spkts', 'dpkts',\n",
    "    'attack_label',\n",
    "]\n",
    "flow1 = flow1[flow_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "37cba272-92cf-40a8-8815-c14d0f8c6787",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "      <th>dur</th>\n",
       "      <th>sbytes</th>\n",
       "      <th>dbytes</th>\n",
       "      <th>sload</th>\n",
       "      <th>dload</th>\n",
       "      <th>spkts</th>\n",
       "      <th>dpkts</th>\n",
       "      <th>attack_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>1390</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>0.001055</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>500473.937500</td>\n",
       "      <td>6.218009e+05</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>udp</td>\n",
       "      <td>0.036133</td>\n",
       "      <td>528</td>\n",
       "      <td>304</td>\n",
       "      <td>87676.085940</td>\n",
       "      <td>5.048017e+04</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>1464</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>0.001119</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>521894.531300</td>\n",
       "      <td>6.362824e+05</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>0.001209</td>\n",
       "      <td>132</td>\n",
       "      <td>164</td>\n",
       "      <td>436724.562500</td>\n",
       "      <td>5.425972e+05</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>udp</td>\n",
       "      <td>0.001169</td>\n",
       "      <td>146</td>\n",
       "      <td>178</td>\n",
       "      <td>499572.250000</td>\n",
       "      <td>6.090676e+05</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059410</th>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>198981.250000</td>\n",
       "      <td>1.091598e+07</td>\n",
       "      <td>262</td>\n",
       "      <td>526</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059411</th>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>0.564998</td>\n",
       "      <td>14106</td>\n",
       "      <td>772406</td>\n",
       "      <td>198981.250000</td>\n",
       "      <td>1.091598e+07</td>\n",
       "      <td>262</td>\n",
       "      <td>526</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059412</th>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>33094</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>43433</td>\n",
       "      <td>tcp</td>\n",
       "      <td>0.087306</td>\n",
       "      <td>320</td>\n",
       "      <td>1828</td>\n",
       "      <td>24465.671880</td>\n",
       "      <td>1.466108e+05</td>\n",
       "      <td>6</td>\n",
       "      <td>8</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059413</th>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35433</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>2.200934</td>\n",
       "      <td>3498</td>\n",
       "      <td>166054</td>\n",
       "      <td>12496.513670</td>\n",
       "      <td>5.983751e+05</td>\n",
       "      <td>58</td>\n",
       "      <td>116</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2059414</th>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>17293</td>\n",
       "      <td>149.171.126.17</td>\n",
       "      <td>110</td>\n",
       "      <td>tcp</td>\n",
       "      <td>0.942984</td>\n",
       "      <td>574</td>\n",
       "      <td>676</td>\n",
       "      <td>4470.913574</td>\n",
       "      <td>5.259898e+03</td>\n",
       "      <td>12</td>\n",
       "      <td>12</td>\n",
       "      <td>exploits</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2059415 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            source_ip  source_port  destination_ip  destination_port protocol  \\\n",
       "0          59.166.0.0         1390   149.171.126.6                53      udp   \n",
       "1          59.166.0.0        33661   149.171.126.9              1024      udp   \n",
       "2          59.166.0.6         1464   149.171.126.7                53      udp   \n",
       "3          59.166.0.5         3593   149.171.126.5                53      udp   \n",
       "4          59.166.0.3        49664   149.171.126.0                53      udp   \n",
       "...               ...          ...             ...               ...      ...   \n",
       "2059410    59.166.0.1        38606   149.171.126.9                80      tcp   \n",
       "2059411    59.166.0.1        38606   149.171.126.9                80      tcp   \n",
       "2059412    59.166.0.5        33094   149.171.126.7             43433      tcp   \n",
       "2059413    59.166.0.9        35433   149.171.126.0                80      tcp   \n",
       "2059414  175.45.176.0        17293  149.171.126.17               110      tcp   \n",
       "\n",
       "              dur  sbytes  dbytes          sload         dload  spkts  dpkts  \\\n",
       "0        0.001055     132     164  500473.937500  6.218009e+05      2      2   \n",
       "1        0.036133     528     304   87676.085940  5.048017e+04      4      4   \n",
       "2        0.001119     146     178  521894.531300  6.362824e+05      2      2   \n",
       "3        0.001209     132     164  436724.562500  5.425972e+05      2      2   \n",
       "4        0.001169     146     178  499572.250000  6.090676e+05      2      2   \n",
       "...           ...     ...     ...            ...           ...    ...    ...   \n",
       "2059410  0.564998   14106  772406  198981.250000  1.091598e+07    262    526   \n",
       "2059411  0.564998   14106  772406  198981.250000  1.091598e+07    262    526   \n",
       "2059412  0.087306     320    1828   24465.671880  1.466108e+05      6      8   \n",
       "2059413  2.200934    3498  166054   12496.513670  5.983751e+05     58    116   \n",
       "2059414  0.942984     574     676    4470.913574  5.259898e+03     12     12   \n",
       "\n",
       "        attack_label  \n",
       "0             normal  \n",
       "1             normal  \n",
       "2             normal  \n",
       "3             normal  \n",
       "4             normal  \n",
       "...              ...  \n",
       "2059410       normal  \n",
       "2059411       normal  \n",
       "2059412       normal  \n",
       "2059413       normal  \n",
       "2059414     exploits  \n",
       "\n",
       "[2059415 rows x 13 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show flow1 again after the column selection (13 columns remain).\n",
    "flow1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "44e57b16-76de-4cd6-920d-8b78a3f54939",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write flow1 to a CSV file in the working directory, without the index column.\n",
    "flow1.to_csv(path_or_buf='./unsw_flow1.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
